In [ ]:
import pandas as pd

# Test
df = pd.DataFrame({"Yes": [30, 21], "No": [12, 20]}, index=["Poll 1", "Poll 2"])
print(df)
        Yes  No
Poll 1   30  12
Poll 2   21  20
In [ ]:
# Reading in salary data from kaggle on Software Engineer positions

salaries = pd.read_csv("Software Engineer Salaries.csv")
salaries.shape
salaries.Company
# is the same as salaries["Company"]

# Drop all rows were an entry is NaN

salaries.dropna().shape
salaries = salaries.dropna()
In [ ]:
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook" 

print(salaries.head())

# How many different companies are contained in the dataset?
print(salaries.nunique(axis=0)["Company"])

# How many different job titles are contained in the dataset?
print(salaries.nunique(axis=0)["Job Title"])

# What is the distribution of scores? Add the mean as a vertical line.
fig = px.histogram(salaries, x="Company Score")
fig.add_vline(x=salaries["Company Score"].mean())
                         Company  Company Score  \
0                       ViewSoft            4.8   
1                        Workiva            4.3   
2     Garmin International, Inc.            3.9   
3                       Snapchat            3.5   
4  Vitesco Technologies Group AG            3.1   

                                           Job Title         Location Date  \
0                                  Software Engineer     Manassas, VA   8d   
1                          Software Support Engineer           Remote   2d   
2                               C# Software Engineer         Cary, NC   2d   
3  Software Engineer, Fullstack, 1+ Years of Expe...  Los Angeles, CA   2d   
4                                  Software Engineer       Seguin, TX   2d   

                          Salary  
0   $68K - $94K (Glassdoor est.)  
1   $61K - $104K (Employer est.)  
2  $95K - $118K (Glassdoor est.)  
3   $97K - $145K (Employer est.)  
4  $85K - $108K (Glassdoor est.)  
578
468
In [ ]:
import numpy as np 
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook" 

# What is the score of each company? And sort from best to worst by resetting index 0,1,2,3,... and drop index column - save under new variable scores

# What is the minimum and maximum salary per positon? Data cleaning requiered since salary column contains string + needs to differentiate for case where only one value is given
split_salary = salaries.Salary.apply(lambda x: pd.Series(x.split("-")))

split_salary = split_salary.apply(lambda x: x.str.extract(r"(\d+)", expand=False)) 
split_salary = split_salary.apply(pd.to_numeric) 
split_salary = split_salary.rename(columns={0: "Minimum", 1: "Maximum"})  

split_salary["Maximum"] = split_salary["Maximum"].astype(float)  
split_salary["Minimum"] = split_salary["Minimum"].astype(float)  
split_salary["Maximum"] = np.where(pd.isnull(split_salary["Maximum"]), split_salary["Minimum"], split_salary["Maximum"])  
split_salary = pd.concat([salaries["Company"], salaries["Company Score"], salaries["Job Title"], split_salary], axis=1)

px.scatter(x=split_salary["Company Score"], y=split_salary["Minimum"], trendline="ols")
px.scatter(x=split_salary["Company Score"], y=split_salary["Maximum"], trendline="lowess")

# There seems to be no correlation between how well companies are scored and how high their salaries are.
In [ ]:
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook" 

# We could ask ourselves whether salaries differ substantially between different states.

split_location = salaries.Location.apply(lambda x: pd.Series(x.split(",")))

split_location = split_location.rename(columns={0: "City", 1: "State"})  

split_location = pd.concat([split_salary, split_location], axis=1)

split_location = split_location.dropna()

px.box(x=split_location["State"], y=split_location["Maximum"])
px.box(x=split_location["State"], y=split_location["Minimum"])

average_salaries_per_state = pd.DataFrame(split_location.groupby("State")["Minimum"].mean()).reset_index()

px.choropleth(locations=average_salaries_per_state["State"].str.strip(), locationmode="USA-states", color=average_salaries_per_state["Minimum"], scope="usa")
In [ ]:
import statsmodels.api as sm
import pandas as pd

# Can we model minimum salary based on US state?

split_location_encoded = pd.get_dummies(split_location, columns=["State"])  
  
Y = split_location_encoded["Minimum"]  
X = split_location_encoded.drop(["Minimum", "Maximum", "Company", "Company Score", "Job Title", "City"], axis=1)  
X = X.astype(int)

X = sm.add_constant(X)  
  
# Conduct a multiple linear regression analysis
model = sm.OLS(Y, X).fit()  
  
print(model.summary()) 

# > Looks like there is no state that significantly predicts salary coefficient added to the model intercept
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Minimum   R-squared:                       0.055
Model:                            OLS   Adj. R-squared:                 -0.013
Method:                 Least Squares   F-statistic:                    0.8039
Date:                Fri, 16 Aug 2024   Prob (F-statistic):              0.811
Time:                        13:52:55   Log-Likelihood:                -3216.5
No. Observations:                 637   AIC:                             6521.
Df Residuals:                     593   BIC:                             6717.
Df Model:                          43                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        101.0912      3.021     33.458      0.000      95.157     107.025
State_ AL     12.4088     19.352      0.641      0.522     -25.598      50.416
State_ AR     -8.5912     27.201     -0.316      0.752     -62.013      44.830
State_ AZ     -8.0912     11.916     -0.679      0.497     -31.494      15.312
State_ CA      1.8117      4.829      0.375      0.708      -7.672      11.296
State_ CO      0.2588      9.067      0.029      0.977     -17.548      18.065
State_ CT      5.3373     14.762      0.362      0.718     -23.655      34.329
State_ DC     16.0088     12.461      1.285      0.199      -8.464      40.482
State_ DE    -11.7579     22.278     -0.528      0.598     -55.511      31.995
State_ FL     -1.7579      7.954     -0.221      0.825     -17.378      13.863
State_ GA     -1.4391      8.525     -0.169      0.866     -18.182      15.303
State_ HI     33.9088     38.349      0.884      0.377     -41.407     109.225
State_ IA     -7.8412     19.352     -0.405      0.685     -45.848      30.166
State_ ID     -7.5912     19.352     -0.392      0.695     -45.598      30.416
State_ IL     -6.3956      8.525     -0.750      0.453     -23.138      10.347
State_ IN     11.2421     22.278      0.505      0.614     -32.511      54.995
State_ KS     36.9088     19.352      1.907      0.057      -1.098      74.916
State_ KY     53.4088     27.201      1.964      0.050      -0.013     106.830
State_ MA      7.6655      6.973      1.099      0.272      -6.030      21.361
State_ MD      6.0827      5.505      1.105      0.270      -4.730      16.895
State_ MI     -1.0386      9.276     -0.112      0.911     -19.257      17.180
State_ MN     23.3532     13.096      1.783      0.075      -2.368      49.074
State_ MO    -12.0912     38.349     -0.315      0.753     -87.407      63.225
State_ MS    -34.4246     22.278     -1.545      0.123     -78.177       9.328
State_ NC      3.8179     11.916      0.320      0.749     -19.585      27.221
State_ NE      5.9088     27.201      0.217      0.828     -47.513      59.330
State_ NH     13.9088     27.201      0.511      0.609     -39.513      67.330
State_ NJ      8.3298      9.276      0.898      0.370      -9.889      26.548
State_ NM    -31.5912     27.201     -1.161      0.246     -85.013      21.830
State_ NY     -7.5528      8.083     -0.934      0.350     -23.428       8.323
State_ OH     10.5451     11.916      0.885      0.377     -12.858      33.948
State_ OK     11.2421     22.278      0.505      0.614     -32.511      54.995
State_ OR      1.1088     17.362      0.064      0.949     -32.989      35.207
State_ PA     17.7323      9.752      1.818      0.070      -1.420      36.885
State_ PR      5.9088     27.201      0.217      0.828     -47.513      59.330
State_ RI     -2.5912     27.201     -0.095      0.924     -56.013      50.830
State_ SC    -23.4246     22.278     -1.051      0.293     -67.177      20.328
State_ TN    -29.0912     38.349     -0.759      0.448    -104.407      46.225
State_ TX      3.9564      6.628      0.597      0.551      -9.060      16.973
State_ UT     10.9088     15.897      0.686      0.493     -20.312      42.130
State_ VA    -10.6047      6.973     -1.521      0.129     -24.300       3.091
State_ VT    -17.0912     22.278     -0.767      0.443     -60.844      26.661
State_ WA      6.9326      6.628      1.046      0.296      -6.084      19.949
State_ WI     -3.5458     11.916     -0.298      0.766     -26.949      19.857
State_ WV     18.9088     38.349      0.493      0.622     -56.407      94.225
==============================================================================
Omnibus:                      220.217   Durbin-Watson:                   1.924
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1002.020
Skew:                           1.509   Prob(JB):                    2.59e-218
Kurtosis:                       8.352   Cond. No.                     1.16e+15
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.07e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [ ]:
salaries.loc[salaries["Job Title"] == "Software Engineer",]
split_salary.loc[(split_salary["Job Title"] == "Software Engineer") & (split_salary["Minimum"] > 40),]
split_location.loc[split_location.State.isin([" AR"]),]

# Whats the position with the highest minimum salary per company? 
split_salary.groupby(["Company"]).apply(lambda split_salary: split_salary.loc[split_salary.Minimum.idxmax()])
# > Here, per group the index (row number) of the highest minimum salary position is used to return the row of that group , then all rows for each group are returend in one dataframe containing company names only once

# How many different listings posted on different dates for each Job title were posted per company?
num_listings_per_position = salaries.groupby(["Company", "Job Title"]).Date.agg([len])
#print(num_listings_per_position)

# This pd DataFrame uses multi index:
mi = num_listings_per_position.index
type(mi)

num_listings_per_position.sort_values(by="len", ascending=False)
# > Microsoft posted 3 different Software Engineer positions in the past few days, Boeing has posted 2... etc...

# We can also sort by the index (Company and Job Title first letter)
num_listings_per_position.sort_index()
C:\Users\klosed\AppData\Local\Temp\ipykernel_20556\2452321493.py:6: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Out[ ]:
len
Company Job Title
3 Reasons Consulting Full Stack Software Engineer 1
360care Sr Associate Software Engineer, Database as a Service 1
3R Info Software Engineer 2
7-Eleven, Inc. Software Engineer 1
9th networks Inc Software Engineer IV 1
... ... ...
f5 Senior Software Engineer 1
Software Engineer III 1
iconectiv, LLC. Full Stack Software Engineer 1
kasmo cloud solutions Software Engineers 1
thatgamecompany Senior Software Engineer 1

738 rows × 1 columns

In [ ]:
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook" 

titles = pd.DataFrame(salaries["Job Title"].value_counts()).reset_index()

titles_dict = titles.set_index("Job Title")["count"].to_dict()

fig = px.bar(titles["count"], hover_name=titles["Job Title"])
fig.show()

# Suppose we would like to add the count of each job title to the original data frame

salaries["Count"] = salaries["Job Title"].map(titles_dict)
print(salaries)
                           Company  Company Score  \
0                         ViewSoft            4.8   
1                          Workiva            4.3   
2       Garmin International, Inc.            3.9   
3                         Snapchat            3.5   
4    Vitesco Technologies Group AG            3.1   
..                             ...            ...   
759                         OpenAI            3.1   
760                     NCR Atleos            3.9   
761  Electronic Warfare Associates            3.8   
762                   BillGO, Inc.            3.3   
763                        Genesys            3.6   

                                             Job Title           Location  \
0                                    Software Engineer       Manassas, VA   
1                            Software Support Engineer             Remote   
2                                 C# Software Engineer           Cary, NC   
3    Software Engineer, Fullstack, 1+ Years of Expe...    Los Angeles, CA   
4                                    Software Engineer         Seguin, TX   
..                                                 ...                ...   
759                   Apprentice ETL Software Engineer       Elkridge, MD   
760  Software Engineer, Tool & Infrastructure, Reli...      Palo Alto, CA   
761        Software Engineer, Engineering Acceleration  San Francisco, CA   
762                  ATM Software Engineer (Level III)         Frisco, TX   
763                                  Software Engineer   Mount Laurel, NJ   

     Date                         Salary  Count  
0      8d   $68K - $94K (Glassdoor est.)    139  
1      2d   $61K - $104K (Employer est.)      2  
2      2d  $95K - $118K (Glassdoor est.)      2  
3      2d   $97K - $145K (Employer est.)      1  
4      2d  $85K - $108K (Glassdoor est.)    139  
..    ...                            ...    ...  
759  30d+  $160K - $385K (Employer est.)      1  
760    1d          $222K (Employer est.)      1  
761    5d  $82K - $118K (Glassdoor est.)      1  
762  30d+  $69K - $107K (Glassdoor est.)      1  
763  30d+   $95K - $143K (Employer est.)    139  

[753 rows x 7 columns]